{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0a18b6dd",
   "metadata": {},
   "source": [
    "## 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d681d5fc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/macbook/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.1' currently installed).\n",
      "  from pandas.core.computation.check import NUMEXPR_INSTALLED\n",
      "/Users/macbook/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.4' currently installed).\n",
      "  from pandas.core import (\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ï»¿Quality</th>\n",
       "      <th>#1 ID</th>\n",
       "      <th>#2 ID</th>\n",
       "      <th>#1 String</th>\n",
       "      <th>#2 String</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>702876</td>\n",
       "      <td>702977</td>\n",
       "      <td>Amrozi accused his brother , whom he called &lt;Q...</td>\n",
       "      <td>Referring to him as only &lt;QUOTE&gt; the witness &lt;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2108705</td>\n",
       "      <td>2108831</td>\n",
       "      <td>Yucaipa owned Dominick 's before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick 's in 1995 for $ 693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1330381</td>\n",
       "      <td>1330521</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 , the ship 's owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>3344667</td>\n",
       "      <td>3344648</td>\n",
       "      <td>Around 0335 GMT , Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents , or 4.6 % , to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1236820</td>\n",
       "      <td>1236712</td>\n",
       "      <td>The stock rose $ 2.11 , or about 11 percent , ...</td>\n",
       "      <td>PG &amp; E Corp. shares jumped $ 1.63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>2685984</td>\n",
       "      <td>2686122</td>\n",
       "      <td>After Hughes refused to rehire Hernandez , he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>339215</td>\n",
       "      <td>339172</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>2996850</td>\n",
       "      <td>2996734</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany , who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>2095781</td>\n",
       "      <td>2095812</td>\n",
       "      <td>Last week the power station â s US owners , ...</td>\n",
       "      <td>The news comes after Drax 's American owner , ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>2136244</td>\n",
       "      <td>2136052</td>\n",
       "      <td>Sobig.F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      ï»¿Quality    #1 ID    #2 ID  \\\n",
       "0              1   702876   702977   \n",
       "1              0  2108705  2108831   \n",
       "2              1  1330381  1330521   \n",
       "3              0  3344667  3344648   \n",
       "4              1  1236820  1236712   \n",
       "...          ...      ...      ...   \n",
       "5796           0  2685984  2686122   \n",
       "5797           0   339215   339172   \n",
       "5798           0  2996850  2996734   \n",
       "5799           1  2095781  2095812   \n",
       "5800           1  2136244  2136052   \n",
       "\n",
       "                                              #1 String  \\\n",
       "0     Amrozi accused his brother , whom he called <Q...   \n",
       "1     Yucaipa owned Dominick 's before selling the c...   \n",
       "2     They had published an advertisement on the Int...   \n",
       "3     Around 0335 GMT , Tab shares were up 19 cents ...   \n",
       "4     The stock rose $ 2.11 , or about 11 percent , ...   \n",
       "...                                                 ...   \n",
       "5796  After Hughes refused to rehire Hernandez , he ...   \n",
       "5797  There are 103 Democrats in the Assembly and 47...   \n",
       "5798  Bethany Hamilton remained in stable condition ...   \n",
       "5799  Last week the power station â s US owners , ...   \n",
       "5800  Sobig.F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                              #2 String  \n",
       "0     Referring to him as only <QUOTE> the witness <...  \n",
       "1     Yucaipa bought Dominick 's in 1995 for $ 693 m...  \n",
       "2     On June 10 , the ship 's owners had published ...  \n",
       "3     Tab shares jumped 20 cents , or 4.6 % , to set...  \n",
       "4     PG & E Corp. shares jumped $ 1.63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany , who remained in stable condition aft...  \n",
       "5799  The news comes after Drax 's American owner , ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 5 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第14章/读取原数据文件\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "data = pd.read_csv('data/msr_paraphrase.csv', sep='\\t')\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e885bbd5",
   "metadata": {},
   "source": [
    "## 删除无用的列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c59921b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ï»¿Quality</th>\n",
       "      <th>#1 String</th>\n",
       "      <th>#2 String</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother , whom he called &lt;Q...</td>\n",
       "      <td>Referring to him as only &lt;QUOTE&gt; the witness &lt;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick 's before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick 's in 1995 for $ 693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 , the ship 's owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT , Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents , or 4.6 % , to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose $ 2.11 , or about 11 percent , ...</td>\n",
       "      <td>PG &amp; E Corp. shares jumped $ 1.63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez , he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany , who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station â s US owners , ...</td>\n",
       "      <td>The news comes after Drax 's American owner , ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig.F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      ï»¿Quality                                          #1 String  \\\n",
       "0              1  Amrozi accused his brother , whom he called <Q...   \n",
       "1              0  Yucaipa owned Dominick 's before selling the c...   \n",
       "2              1  They had published an advertisement on the Int...   \n",
       "3              0  Around 0335 GMT , Tab shares were up 19 cents ...   \n",
       "4              1  The stock rose $ 2.11 , or about 11 percent , ...   \n",
       "...          ...                                                ...   \n",
       "5796           0  After Hughes refused to rehire Hernandez , he ...   \n",
       "5797           0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798           0  Bethany Hamilton remained in stable condition ...   \n",
       "5799           1  Last week the power station â s US owners , ...   \n",
       "5800           1  Sobig.F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                              #2 String  \n",
       "0     Referring to him as only <QUOTE> the witness <...  \n",
       "1     Yucaipa bought Dominick 's in 1995 for $ 693 m...  \n",
       "2     On June 10 , the ship 's owners had published ...  \n",
       "3     Tab shares jumped 20 cents , or 4.6 % , to set...  \n",
       "4     PG & E Corp. shares jumped $ 1.63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany , who remained in stable condition aft...  \n",
       "5799  The news comes after Drax 's American owner , ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/删除无用的两列数据\n",
    "data.pop('#1 ID')\n",
    "data.pop('#2 ID')\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d9a7218",
   "metadata": {},
   "source": [
    "## 重命名列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "97f44d5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother , whom he called &lt;Q...</td>\n",
       "      <td>Referring to him as only &lt;QUOTE&gt; the witness &lt;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick 's before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick 's in 1995 for $ 693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 , the ship 's owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT , Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents , or 4.6 % , to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose $ 2.11 , or about 11 percent , ...</td>\n",
       "      <td>PG &amp; E Corp. shares jumped $ 1.63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez , he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany , who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station â s US owners , ...</td>\n",
       "      <td>The news comes after Drax 's American owner , ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig.F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother , whom he called <Q...   \n",
       "1        0  Yucaipa owned Dominick 's before selling the c...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT , Tab shares were up 19 cents ...   \n",
       "4        1  The stock rose $ 2.11 , or about 11 percent , ...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez , he ...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station â s US owners , ...   \n",
       "5800     1  Sobig.F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only <QUOTE> the witness <...  \n",
       "1     Yucaipa bought Dominick 's in 1995 for $ 693 m...  \n",
       "2     On June 10 , the ship 's owners had published ...  \n",
       "3     Tab shares jumped 20 cents , or 4.6 % , to set...  \n",
       "4     PG & E Corp. shares jumped $ 1.63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany , who remained in stable condition aft...  \n",
       "5799  The news comes after Drax 's American owner , ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/重命名列\n",
    "columns = list(data.columns)\n",
    "columns[0] = 'same'\n",
    "columns[1] = 's1'\n",
    "columns[2] = 's2'\n",
    "data.columns = columns\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a34df6bb",
   "metadata": {},
   "source": [
    "## 删除特殊符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b98d30f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother , whom he called   ...</td>\n",
       "      <td>Referring to him as only   the witness   , Amr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick 's before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick 's in 1995 for $ 693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 , the ship 's owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT , Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents , or 4.6 % , to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose $ 2.11 , or about 11 percent , ...</td>\n",
       "      <td>PG &amp; E Corp. shares jumped $ 1.63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez , he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany , who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station â s US owners , ...</td>\n",
       "      <td>The news comes after Drax 's American owner , ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig.F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother , whom he called   ...   \n",
       "1        0  Yucaipa owned Dominick 's before selling the c...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT , Tab shares were up 19 cents ...   \n",
       "4        1  The stock rose $ 2.11 , or about 11 percent , ...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez , he ...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station â s US owners , ...   \n",
       "5800     1  Sobig.F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only   the witness   , Amr...  \n",
       "1     Yucaipa bought Dominick 's in 1995 for $ 693 m...  \n",
       "2     On June 10 , the ship 's owners had published ...  \n",
       "3     Tab shares jumped 20 cents , or 4.6 % , to set...  \n",
       "4     PG & E Corp. shares jumped $ 1.63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany , who remained in stable condition aft...  \n",
       "5799  The news comes after Drax 's American owner , ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/删除文本中的<QUOTE>符号\n",
    "data['s1'] = data['s1'].str.replace('<QUOTE>', ' ')\n",
    "data['s2'] = data['s2'].str.replace('<QUOTE>', ' ')\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8bf7e15f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother   whom he called   ...</td>\n",
       "      <td>Referring to him as only   the witness     Amr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick  s before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick  s in 1995 for   693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10   the ship  s owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT   Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents   or 4 6     to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose   2 11   or about 11 percent   ...</td>\n",
       "      <td>PG   E Corp  shares jumped   1 63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez   he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany   who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station â   s US owners   ...</td>\n",
       "      <td>The news comes after Drax  s American owner   ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother   whom he called   ...   \n",
       "1        0  Yucaipa owned Dominick  s before selling the c...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT   Tab shares were up 19 cents ...   \n",
       "4        1  The stock rose   2 11   or about 11 percent   ...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez   he ...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station â   s US owners   ...   \n",
       "5800     1  Sobig F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only   the witness     Amr...  \n",
       "1     Yucaipa bought Dominick  s in 1995 for   693 m...  \n",
       "2     On June 10   the ship  s owners had published ...  \n",
       "3     Tab shares jumped 20 cents   or 4 6     to set...  \n",
       "4     PG   E Corp  shares jumped   1 63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany   who remained in stable condition aft...  \n",
       "5799  The news comes after Drax  s American owner   ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/删除标点符号\n",
    "\n",
    "def remove_punctuation_marks(x):\n",
    "    return re.sub(r'[^\\w\\s]', ' ', x)\n",
    "\n",
    "data['s1'] = data['s1'].apply(remove_punctuation_marks)\n",
    "data['s2'] = data['s2'].apply(remove_punctuation_marks)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "79ed5086",
   "metadata": {},
   "source": [
    "## 替换特殊符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "31407a7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother   whom he called   ...</td>\n",
       "      <td>Referring to him as only   the witness     Amr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick  s before selling the c...</td>\n",
       "      <td>Yucaipa bought Dominick  s in 1995 for   693 m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10   the ship  s owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT   Tab shares were up 19 cents ...</td>\n",
       "      <td>Tab shares jumped 20 cents   or 4 6     to set...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose   2 11   or about 11 percent   ...</td>\n",
       "      <td>PG   E Corp  shares jumped   1 63 or 8 percent...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez   he ...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany   who remained in stable condition aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station a   s US owners   ...</td>\n",
       "      <td>The news comes after Drax  s American owner   ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother   whom he called   ...   \n",
       "1        0  Yucaipa owned Dominick  s before selling the c...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT   Tab shares were up 19 cents ...   \n",
       "4        1  The stock rose   2 11   or about 11 percent   ...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez   he ...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station a   s US owners   ...   \n",
       "5800     1  Sobig F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only   the witness     Amr...  \n",
       "1     Yucaipa bought Dominick  s in 1995 for   693 m...  \n",
       "2     On June 10   the ship  s owners had published ...  \n",
       "3     Tab shares jumped 20 cents   or 4 6     to set...  \n",
       "4     PG   E Corp  shares jumped   1 63 or 8 percent...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany   who remained in stable condition aft...  \n",
       "5799  The news comes after Drax  s American owner   ...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/替换特殊字符\n",
    "data['s1'] = data['s1'].str.replace('â', 'a')\n",
    "data['s2'] = data['s2'].str.replace('â', 'a')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('Â', 'A')\n",
    "data['s2'] = data['s2'].str.replace('Â', 'A')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('Ã', 'A')\n",
    "data['s2'] = data['s2'].str.replace('Ã', 'A')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('_', ' ')\n",
    "data['s2'] = data['s2'].str.replace('_', ' ')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('µ', 'u')\n",
    "data['s2'] = data['s2'].str.replace('µ', 'u')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('³', ' ')\n",
    "data['s2'] = data['s2'].str.replace('³', ' ')\n",
    "\n",
    "data['s1'] = data['s1'].str.replace('½', ' ')\n",
    "data['s2'] = data['s2'].str.replace('½', ' ')\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ad9b4f5",
   "metadata": {},
   "source": [
    "## 合并连续空格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "76aa07ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother whom he called the ...</td>\n",
       "      <td>Referring to him as only the witness Amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick s before selling the ch...</td>\n",
       "      <td>Yucaipa bought Dominick s in 1995 for 693 mill...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 the ship s owners had published an ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT Tab shares were up 19 cents or...</td>\n",
       "      <td>Tab shares jumped 20 cents or 4 6 to set a rec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose 2 11 or about 11 percent to clo...</td>\n",
       "      <td>PG E Corp shares jumped 1 63 or 8 percent to 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez he co...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station a s US owners AES ...</td>\n",
       "      <td>The news comes after Drax s American owner AES...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother whom he called the ...   \n",
       "1        0  Yucaipa owned Dominick s before selling the ch...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT Tab shares were up 19 cents or...   \n",
       "4        1  The stock rose 2 11 or about 11 percent to clo...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez he co...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station a s US owners AES ...   \n",
       "5800     1  Sobig F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only the witness Amrozi ac...  \n",
       "1     Yucaipa bought Dominick s in 1995 for 693 mill...  \n",
       "2     On June 10 the ship s owners had published an ...  \n",
       "3     Tab shares jumped 20 cents or 4 6 to set a rec...  \n",
       "4     PG E Corp shares jumped 1 63 or 8 percent to 2...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany who remained in stable condition after...  \n",
       "5799  The news comes after Drax s American owner AES...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第14章/合并连续的空格\n",
    "\n",
    "def merge_consecutive_spaces(x):\n",
    "    return re.sub(r\"\\s{2,}\", \" \", x)\n",
    "\n",
    "data['s1'] = data['s1'].apply(merge_consecutive_spaces)\n",
    "data['s2'] = data['s2'].apply(merge_consecutive_spaces)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12887c94",
   "metadata": {},
   "source": [
    "## 隔开数字与单词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8fece36a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Amrozi accused his brother whom he called the ...</td>\n",
       "      <td>Referring to him as only the witness Amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Yucaipa owned Dominick s before selling the ch...</td>\n",
       "      <td>Yucaipa bought Dominick s in 1995 for 693 mill...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10 the ship s owners had published an ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>Around 0335 GMT Tab shares were up 19 cents or...</td>\n",
       "      <td>Tab shares jumped 20 cents or 4 6 to set a rec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>The stock rose 2 11 or about 11 percent to clo...</td>\n",
       "      <td>PG E Corp shares jumped 1 63 or 8 percent to 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>After Hughes refused to rehire Hernandez he co...</td>\n",
       "      <td>Hernandez filed an Equal Employment Opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>There are 103 Democrats in the Assembly and 47...</td>\n",
       "      <td>Democrats dominate the Assembly while Republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Bethany Hamilton remained in stable condition ...</td>\n",
       "      <td>Bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>Last week the power station a s US owners AES ...</td>\n",
       "      <td>The news comes after Drax s American owner AES...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>Sobig F spreads when unsuspecting computer use...</td>\n",
       "      <td>The virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  Amrozi accused his brother whom he called the ...   \n",
       "1        0  Yucaipa owned Dominick s before selling the ch...   \n",
       "2        1  They had published an advertisement on the Int...   \n",
       "3        0  Around 0335 GMT Tab shares were up 19 cents or...   \n",
       "4        1  The stock rose 2 11 or about 11 percent to clo...   \n",
       "...    ...                                                ...   \n",
       "5796     0  After Hughes refused to rehire Hernandez he co...   \n",
       "5797     0  There are 103 Democrats in the Assembly and 47...   \n",
       "5798     0  Bethany Hamilton remained in stable condition ...   \n",
       "5799     1  Last week the power station a s US owners AES ...   \n",
       "5800     1  Sobig F spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     Referring to him as only the witness Amrozi ac...  \n",
       "1     Yucaipa bought Dominick s in 1995 for 693 mill...  \n",
       "2     On June 10 the ship s owners had published an ...  \n",
       "3     Tab shares jumped 20 cents or 4 6 to set a rec...  \n",
       "4     PG E Corp shares jumped 1 63 or 8 percent to 2...  \n",
       "...                                                 ...  \n",
       "5796  Hernandez filed an Equal Employment Opportunit...  \n",
       "5797  Democrats dominate the Assembly while Republic...  \n",
       "5798  Bethany who remained in stable condition after...  \n",
       "5799  The news comes after Drax s American owner AES...  \n",
       "5800  The virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第14章/拆分数字和字母连写的词\n",
    "\n",
    "\n",
    "def splitting_words(x):\n",
    "    x = re.sub(\"(\\d)([a-zA-Z])\", \"\\\\1 \\\\2\", x)\n",
    "    x = re.sub(\"([a-zA-Z])(\\d)\", \"\\\\1 \\\\2\", x)\n",
    "    return x\n",
    "\n",
    "data['s1'] = data['s1'].apply(splitting_words)\n",
    "data['s2'] = data['s2'].apply(splitting_words)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d69f7c3",
   "metadata": {},
   "source": [
    "## 字母全部小写"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "09b4166c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>amrozi accused his brother whom he called the ...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>yucaipa owned dominick s before selling the ch...</td>\n",
       "      <td>yucaipa bought dominick s in 1995 for 693 mill...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>they had published an advertisement on the int...</td>\n",
       "      <td>on june 10 the ship s owners had published an ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>around 0335 gmt tab shares were up 19 cents or...</td>\n",
       "      <td>tab shares jumped 20 cents or 4 6 to set a rec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>the stock rose 2 11 or about 11 percent to clo...</td>\n",
       "      <td>pg e corp shares jumped 1 63 or 8 percent to 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>after hughes refused to rehire hernandez he co...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>there are 103 democrats in the assembly and 47...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>bethany hamilton remained in stable condition ...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>last week the power station a s us owners aes ...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>sobig f spreads when unsuspecting computer use...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  amrozi accused his brother whom he called the ...   \n",
       "1        0  yucaipa owned dominick s before selling the ch...   \n",
       "2        1  they had published an advertisement on the int...   \n",
       "3        0  around 0335 gmt tab shares were up 19 cents or...   \n",
       "4        1  the stock rose 2 11 or about 11 percent to clo...   \n",
       "...    ...                                                ...   \n",
       "5796     0  after hughes refused to rehire hernandez he co...   \n",
       "5797     0  there are 103 democrats in the assembly and 47...   \n",
       "5798     0  bethany hamilton remained in stable condition ...   \n",
       "5799     1  last week the power station a s us owners aes ...   \n",
       "5800     1  sobig f spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     referring to him as only the witness amrozi ac...  \n",
       "1     yucaipa bought dominick s in 1995 for 693 mill...  \n",
       "2     on june 10 the ship s owners had published an ...  \n",
       "3     tab shares jumped 20 cents or 4 6 to set a rec...  \n",
       "4     pg e corp shares jumped 1 63 or 8 percent to 2...  \n",
       "...                                                 ...  \n",
       "5796  hernandez filed an equal employment opportunit...  \n",
       "5797  democrats dominate the assembly while republic...  \n",
       "5798  bethany who remained in stable condition after...  \n",
       "5799  the news comes after drax s american owner aes...  \n",
       "5800  the virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/删除首尾空格并小写所有字母\n",
    "data['s1'] = data['s1'].str.strip()\n",
    "data['s2'] = data['s2'].str.strip()\n",
    "\n",
    "data['s1'] = data['s1'].str.lower()\n",
    "data['s2'] = data['s2'].str.lower()\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08e4b6c5",
   "metadata": {},
   "source": [
    "## 替换数字为符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "54e43e10",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>amrozi accused his brother whom he called the ...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>yucaipa owned dominick s before selling the ch...</td>\n",
       "      <td>yucaipa bought dominick s in &lt;NUM&gt; for &lt;NUM&gt; m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>they had published an advertisement on the int...</td>\n",
       "      <td>on june &lt;NUM&gt; the ship s owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>around &lt;NUM&gt; gmt tab shares were up &lt;NUM&gt; cent...</td>\n",
       "      <td>tab shares jumped &lt;NUM&gt; cents or &lt;NUM&gt; &lt;NUM&gt; t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM&gt; perc...</td>\n",
       "      <td>pg e corp shares jumped &lt;NUM&gt; &lt;NUM&gt; or &lt;NUM&gt; p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>after hughes refused to rehire hernandez he co...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>there are &lt;NUM&gt; democrats in the assembly and ...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>bethany hamilton remained in stable condition ...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>last week the power station a s us owners aes ...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>sobig f spreads when unsuspecting computer use...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  amrozi accused his brother whom he called the ...   \n",
       "1        0  yucaipa owned dominick s before selling the ch...   \n",
       "2        1  they had published an advertisement on the int...   \n",
       "3        0  around <NUM> gmt tab shares were up <NUM> cent...   \n",
       "4        1  the stock rose <NUM> <NUM> or about <NUM> perc...   \n",
       "...    ...                                                ...   \n",
       "5796     0  after hughes refused to rehire hernandez he co...   \n",
       "5797     0  there are <NUM> democrats in the assembly and ...   \n",
       "5798     0  bethany hamilton remained in stable condition ...   \n",
       "5799     1  last week the power station a s us owners aes ...   \n",
       "5800     1  sobig f spreads when unsuspecting computer use...   \n",
       "\n",
       "                                                     s2  \n",
       "0     referring to him as only the witness amrozi ac...  \n",
       "1     yucaipa bought dominick s in <NUM> for <NUM> m...  \n",
       "2     on june <NUM> the ship s owners had published ...  \n",
       "3     tab shares jumped <NUM> cents or <NUM> <NUM> t...  \n",
       "4     pg e corp shares jumped <NUM> <NUM> or <NUM> p...  \n",
       "...                                                 ...  \n",
       "5796  hernandez filed an equal employment opportunit...  \n",
       "5797  democrats dominate the assembly while republic...  \n",
       "5798  bethany who remained in stable condition after...  \n",
       "5799  the news comes after drax s american owner aes...  \n",
       "5800  the virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第14章/替换数字为符号\n",
    "def replace_numbers(x):\n",
    "    return re.sub(r\"\\d+\", \"<NUM>\", x)\n",
    "\n",
    "data['s1'] = data['s1'].apply(replace_numbers)\n",
    "data['s2'] = data['s2'].apply(replace_numbers)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da468e28",
   "metadata": {},
   "source": [
    "## 添加首尾符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6ab75669",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "      <td>yucaipa bought dominick s in &lt;NUM&gt; for &lt;NUM&gt; m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "      <td>on june &lt;NUM&gt; the ship s owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "      <td>tab shares jumped &lt;NUM&gt; cents or &lt;NUM&gt; &lt;NUM&gt; t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "      <td>pg e corp shares jumped &lt;NUM&gt; &lt;NUM&gt; or &lt;NUM&gt; p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  <SOS> amrozi accused his brother whom he calle...   \n",
       "1        0  <SOS> yucaipa owned dominick s before selling ...   \n",
       "2        1  <SOS> they had published an advertisement on t...   \n",
       "3        0  <SOS> around <NUM> gmt tab shares were up <NUM...   \n",
       "4        1  <SOS> the stock rose <NUM> <NUM> or about <NUM...   \n",
       "...    ...                                                ...   \n",
       "5796     0  <SOS> after hughes refused to rehire hernandez...   \n",
       "5797     0  <SOS> there are <NUM> democrats in the assembl...   \n",
       "5798     0  <SOS> bethany hamilton remained in stable cond...   \n",
       "5799     1  <SOS> last week the power station a s us owner...   \n",
       "5800     1  <SOS> sobig f spreads when unsuspecting comput...   \n",
       "\n",
       "                                                     s2  \n",
       "0     referring to him as only the witness amrozi ac...  \n",
       "1     yucaipa bought dominick s in <NUM> for <NUM> m...  \n",
       "2     on june <NUM> the ship s owners had published ...  \n",
       "3     tab shares jumped <NUM> cents or <NUM> <NUM> t...  \n",
       "4     pg e corp shares jumped <NUM> <NUM> or <NUM> p...  \n",
       "...                                                 ...  \n",
       "5796  hernandez filed an equal employment opportunit...  \n",
       "5797  democrats dominate the assembly while republic...  \n",
       "5798  bethany who remained in stable condition after...  \n",
       "5799  the news comes after drax s american owner aes...  \n",
       "5800  the virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/为s1添加首尾符号\n",
    "def f(sent):\n",
    "    return '<SOS> ' + sent + ' <EOS>'\n",
    "\n",
    "\n",
    "data['s1'] = data['s1'].apply(f)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "31c3bfc1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "      <td>yucaipa bought dominick s in &lt;NUM&gt; for &lt;NUM&gt; m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "      <td>on june &lt;NUM&gt; the ship s owners had published ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "      <td>tab shares jumped &lt;NUM&gt; cents or &lt;NUM&gt; &lt;NUM&gt; t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "      <td>pg e corp shares jumped &lt;NUM&gt; &lt;NUM&gt; or &lt;NUM&gt; p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  <SOS> amrozi accused his brother whom he calle...   \n",
       "1        0  <SOS> yucaipa owned dominick s before selling ...   \n",
       "2        1  <SOS> they had published an advertisement on t...   \n",
       "3        0  <SOS> around <NUM> gmt tab shares were up <NUM...   \n",
       "4        1  <SOS> the stock rose <NUM> <NUM> or about <NUM...   \n",
       "...    ...                                                ...   \n",
       "5796     0  <SOS> after hughes refused to rehire hernandez...   \n",
       "5797     0  <SOS> there are <NUM> democrats in the assembl...   \n",
       "5798     0  <SOS> bethany hamilton remained in stable cond...   \n",
       "5799     1  <SOS> last week the power station a s us owner...   \n",
       "5800     1  <SOS> sobig f spreads when unsuspecting comput...   \n",
       "\n",
       "                                                     s2  \n",
       "0     referring to him as only the witness amrozi ac...  \n",
       "1     yucaipa bought dominick s in <NUM> for <NUM> m...  \n",
       "2     on june <NUM> the ship s owners had published ...  \n",
       "3     tab shares jumped <NUM> cents or <NUM> <NUM> t...  \n",
       "4     pg e corp shares jumped <NUM> <NUM> or <NUM> p...  \n",
       "...                                                 ...  \n",
       "5796  hernandez filed an equal employment opportunit...  \n",
       "5797  democrats dominate the assembly while republic...  \n",
       "5798  bethany who remained in stable condition after...  \n",
       "5799  the news comes after drax s american owner aes...  \n",
       "5800  the virus spreads when unsuspecting computer u...  \n",
       "\n",
       "[5801 rows x 3 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/为s2添加首尾符号\n",
    "def f(sent):\n",
    "    return sent + ' <EOS>'\n",
    "\n",
    "\n",
    "data['s2'] = data['s2'].apply(f)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4ba6a0d",
   "metadata": {},
   "source": [
    "## 求出句子长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "49f4c4b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "      <th>s1_lens</th>\n",
       "      <th>s2_lens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "      <td>yucaipa bought dominick s in &lt;NUM&gt; for &lt;NUM&gt; m...</td>\n",
       "      <td>18</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "      <td>on june &lt;NUM&gt; the ship s owners had published ...</td>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "      <td>tab shares jumped &lt;NUM&gt; cents or &lt;NUM&gt; &lt;NUM&gt; t...</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "      <td>pg e corp shares jumped &lt;NUM&gt; &lt;NUM&gt; or &lt;NUM&gt; p...</td>\n",
       "      <td>23</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "      <td>14</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "      <td>29</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "      <td>28</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  <SOS> amrozi accused his brother whom he calle...   \n",
       "1        0  <SOS> yucaipa owned dominick s before selling ...   \n",
       "2        1  <SOS> they had published an advertisement on t...   \n",
       "3        0  <SOS> around <NUM> gmt tab shares were up <NUM...   \n",
       "4        1  <SOS> the stock rose <NUM> <NUM> or about <NUM...   \n",
       "...    ...                                                ...   \n",
       "5796     0  <SOS> after hughes refused to rehire hernandez...   \n",
       "5797     0  <SOS> there are <NUM> democrats in the assembl...   \n",
       "5798     0  <SOS> bethany hamilton remained in stable cond...   \n",
       "5799     1  <SOS> last week the power station a s us owner...   \n",
       "5800     1  <SOS> sobig f spreads when unsuspecting comput...   \n",
       "\n",
       "                                                     s2  s1_lens  s2_lens  \n",
       "0     referring to him as only the witness amrozi ac...       16       17  \n",
       "1     yucaipa bought dominick s in <NUM> for <NUM> m...       18       21  \n",
       "2     on june <NUM> the ship s owners had published ...       20       20  \n",
       "3     tab shares jumped <NUM> cents or <NUM> <NUM> t...       28       19  \n",
       "4     pg e corp shares jumped <NUM> <NUM> or <NUM> p...       23       22  \n",
       "...                                                 ...      ...      ...  \n",
       "5796  hernandez filed an equal employment opportunit...       16       11  \n",
       "5797  democrats dominate the assembly while republic...       12       10  \n",
       "5798  bethany who remained in stable condition after...       14       17  \n",
       "5799  the news comes after drax s american owner aes...       29       30  \n",
       "5800  the virus spreads when unsuspecting computer u...       28       23  \n",
       "\n",
       "[5801 rows x 5 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/分别求出s1和s2的长度\n",
    "def f(sent):\n",
    "    return len(sent.split(' '))\n",
    "\n",
    "\n",
    "data['s1_lens'] = data['s1'].apply(f)\n",
    "data['s2_lens'] = data['s2'].apply(f)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "44300ac1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "72"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/求s1+s2最大长度\n",
    "max_lens = max(data['s1_lens'] + data['s2_lens'])\n",
    "\n",
    "max_lens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "86e6b129",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "      <th>s1_lens</th>\n",
       "      <th>s2_lens</th>\n",
       "      <th>pad_lens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "      <td>referring to him as only the witness amrozi ac...</td>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "      <td>yucaipa bought dominick s in &lt;NUM&gt; for &lt;NUM&gt; m...</td>\n",
       "      <td>18</td>\n",
       "      <td>21</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "      <td>on june &lt;NUM&gt; the ship s owners had published ...</td>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "      <td>tab shares jumped &lt;NUM&gt; cents or &lt;NUM&gt; &lt;NUM&gt; t...</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "      <td>pg e corp shares jumped &lt;NUM&gt; &lt;NUM&gt; or &lt;NUM&gt; p...</td>\n",
       "      <td>23</td>\n",
       "      <td>22</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "      <td>hernandez filed an equal employment opportunit...</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "      <td>democrats dominate the assembly while republic...</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "      <td>bethany who remained in stable condition after...</td>\n",
       "      <td>14</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "      <td>the news comes after drax s american owner aes...</td>\n",
       "      <td>29</td>\n",
       "      <td>30</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "      <td>the virus spreads when unsuspecting computer u...</td>\n",
       "      <td>28</td>\n",
       "      <td>23</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same                                                 s1  \\\n",
       "0        1  <SOS> amrozi accused his brother whom he calle...   \n",
       "1        0  <SOS> yucaipa owned dominick s before selling ...   \n",
       "2        1  <SOS> they had published an advertisement on t...   \n",
       "3        0  <SOS> around <NUM> gmt tab shares were up <NUM...   \n",
       "4        1  <SOS> the stock rose <NUM> <NUM> or about <NUM...   \n",
       "...    ...                                                ...   \n",
       "5796     0  <SOS> after hughes refused to rehire hernandez...   \n",
       "5797     0  <SOS> there are <NUM> democrats in the assembl...   \n",
       "5798     0  <SOS> bethany hamilton remained in stable cond...   \n",
       "5799     1  <SOS> last week the power station a s us owner...   \n",
       "5800     1  <SOS> sobig f spreads when unsuspecting comput...   \n",
       "\n",
       "                                                     s2  s1_lens  s2_lens  \\\n",
       "0     referring to him as only the witness amrozi ac...       16       17   \n",
       "1     yucaipa bought dominick s in <NUM> for <NUM> m...       18       21   \n",
       "2     on june <NUM> the ship s owners had published ...       20       20   \n",
       "3     tab shares jumped <NUM> cents or <NUM> <NUM> t...       28       19   \n",
       "4     pg e corp shares jumped <NUM> <NUM> or <NUM> p...       23       22   \n",
       "...                                                 ...      ...      ...   \n",
       "5796  hernandez filed an equal employment opportunit...       16       11   \n",
       "5797  democrats dominate the assembly while republic...       12       10   \n",
       "5798  bethany who remained in stable condition after...       14       17   \n",
       "5799  the news comes after drax s american owner aes...       29       30   \n",
       "5800  the virus spreads when unsuspecting computer u...       28       23   \n",
       "\n",
       "      pad_lens  \n",
       "0           39  \n",
       "1           33  \n",
       "2           32  \n",
       "3           25  \n",
       "4           27  \n",
       "...        ...  \n",
       "5796        45  \n",
       "5797        50  \n",
       "5798        41  \n",
       "5799        13  \n",
       "5800        21  \n",
       "\n",
       "[5801 rows x 6 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第14章/求出每个句子需要PAD的长度\n",
    "data['pad_lens'] = max_lens - data['s1_lens'] - data['s2_lens']\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4e9c216",
   "metadata": {},
   "source": [
    "## 合并两句子"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0a0a382a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1_lens</th>\n",
       "      <th>s2_lens</th>\n",
       "      <th>pad_lens</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>39</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "      <td>21</td>\n",
       "      <td>33</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>32</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>23</td>\n",
       "      <td>22</td>\n",
       "      <td>27</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>45</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "      <td>50</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>30</td>\n",
       "      <td>13</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>28</td>\n",
       "      <td>23</td>\n",
       "      <td>21</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same  s1_lens  s2_lens  pad_lens  \\\n",
       "0        1       16       17        39   \n",
       "1        0       18       21        33   \n",
       "2        1       20       20        32   \n",
       "3        0       28       19        25   \n",
       "4        1       23       22        27   \n",
       "...    ...      ...      ...       ...   \n",
       "5796     0       16       11        45   \n",
       "5797     0       12       10        50   \n",
       "5798     0       14       17        41   \n",
       "5799     1       29       30        13   \n",
       "5800     1       28       23        21   \n",
       "\n",
       "                                                   sent  \n",
       "0     <SOS> amrozi accused his brother whom he calle...  \n",
       "1     <SOS> yucaipa owned dominick s before selling ...  \n",
       "2     <SOS> they had published an advertisement on t...  \n",
       "3     <SOS> around <NUM> gmt tab shares were up <NUM...  \n",
       "4     <SOS> the stock rose <NUM> <NUM> or about <NUM...  \n",
       "...                                                 ...  \n",
       "5796  <SOS> after hughes refused to rehire hernandez...  \n",
       "5797  <SOS> there are <NUM> democrats in the assembl...  \n",
       "5798  <SOS> bethany hamilton remained in stable cond...  \n",
       "5799  <SOS> last week the power station a s us owner...  \n",
       "5800  <SOS> sobig f spreads when unsuspecting comput...  \n",
       "\n",
       "[5801 rows x 5 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Chapter 14/merge s1 and s2\n",
    "# Concatenate the s1 and s2 strings of each row, separated by one space.\n",
    "data['sent'] = data['s1'] + ' ' + data['s2']\n",
    "\n",
    "# The merged column replaces the two originals, so remove them in place.\n",
    "del data['s1'], data['s2']\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70ef36d6",
   "metadata": {},
   "source": [
    "## 填充 PAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a5ec9206",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1_lens</th>\n",
       "      <th>s2_lens</th>\n",
       "      <th>pad_lens</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>39</td>\n",
       "      <td>&lt;SOS&gt; amrozi accused his brother whom he calle...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "      <td>21</td>\n",
       "      <td>33</td>\n",
       "      <td>&lt;SOS&gt; yucaipa owned dominick s before selling ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>32</td>\n",
       "      <td>&lt;SOS&gt; they had published an advertisement on t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>&lt;SOS&gt; around &lt;NUM&gt; gmt tab shares were up &lt;NUM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>23</td>\n",
       "      <td>22</td>\n",
       "      <td>27</td>\n",
       "      <td>&lt;SOS&gt; the stock rose &lt;NUM&gt; &lt;NUM&gt; or about &lt;NUM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>45</td>\n",
       "      <td>&lt;SOS&gt; after hughes refused to rehire hernandez...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "      <td>50</td>\n",
       "      <td>&lt;SOS&gt; there are &lt;NUM&gt; democrats in the assembl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>&lt;SOS&gt; bethany hamilton remained in stable cond...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>30</td>\n",
       "      <td>13</td>\n",
       "      <td>&lt;SOS&gt; last week the power station a s us owner...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>28</td>\n",
       "      <td>23</td>\n",
       "      <td>21</td>\n",
       "      <td>&lt;SOS&gt; sobig f spreads when unsuspecting comput...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same  s1_lens  s2_lens  pad_lens  \\\n",
       "0        1       16       17        39   \n",
       "1        0       18       21        33   \n",
       "2        1       20       20        32   \n",
       "3        0       28       19        25   \n",
       "4        1       23       22        27   \n",
       "...    ...      ...      ...       ...   \n",
       "5796     0       16       11        45   \n",
       "5797     0       12       10        50   \n",
       "5798     0       14       17        41   \n",
       "5799     1       29       30        13   \n",
       "5800     1       28       23        21   \n",
       "\n",
       "                                                   sent  \n",
       "0     <SOS> amrozi accused his brother whom he calle...  \n",
       "1     <SOS> yucaipa owned dominick s before selling ...  \n",
       "2     <SOS> they had published an advertisement on t...  \n",
       "3     <SOS> around <NUM> gmt tab shares were up <NUM...  \n",
       "4     <SOS> the stock rose <NUM> <NUM> or about <NUM...  \n",
       "...                                                 ...  \n",
       "5796  <SOS> after hughes refused to rehire hernandez...  \n",
       "5797  <SOS> there are <NUM> democrats in the assembl...  \n",
       "5798  <SOS> bethany hamilton remained in stable cond...  \n",
       "5799  <SOS> last week the power station a s us owner...  \n",
       "5800  <SOS> sobig f spreads when unsuspecting comput...  \n",
       "\n",
       "[5801 rows x 5 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Chapter 14/pad sentences shorter than the maximum length with <PAD>\n",
    "def f(row):\n",
    "    \"\"\"Append row['pad_lens'] '<PAD>' tokens to row['sent'].\n",
    "\n",
    "    Args:\n",
    "        row: mapping-like record (e.g. one DataFrame row) holding a string\n",
    "            under 'sent' and an integer under 'pad_lens'.\n",
    "\n",
    "    Returns:\n",
    "        A copy of row whose 'sent' is followed by the padding tokens, each\n",
    "        separated by a single space. The input row is left untouched.\n",
    "    \"\"\"\n",
    "    # Work on a copy: pandas does not support apply() callbacks that\n",
    "    # mutate the object they are handed.\n",
    "    padded = row.copy()\n",
    "\n",
    "    # Collect all pieces and join once, so a pad length of zero leaves the\n",
    "    # sentence unchanged instead of appending a trailing space.\n",
    "    pieces = [row['sent']] + ['<PAD>'] * row['pad_lens']\n",
    "    padded['sent'] = ' '.join(pieces)\n",
    "    return padded\n",
    "\n",
    "\n",
    "# Run the padding helper on every row; axis='columns' hands f one row at a time.\n",
    "data = data.apply(f, axis='columns')\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19980829",
   "metadata": {},
   "source": [
    "## 构建字典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d032783d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14789, 18)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Chapter 14/build the vocabulary\n",
    "def build_vocab(sents=None):\n",
    "    \"\"\"Map every token of the corpus to an integer id.\n",
    "\n",
    "    Ids 0-10 are reserved for the special symbols listed below; every other\n",
    "    token receives the next free id in order of first appearance.\n",
    "\n",
    "    Args:\n",
    "        sents: optional iterable of sentence strings. When omitted, the\n",
    "            'sent' column of the global data frame is used.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each token string to its integer id.\n",
    "    \"\"\"\n",
    "    if sents is None:\n",
    "        sents = data['sent']\n",
    "\n",
    "    vocab = {\n",
    "        '<PAD>': 0,\n",
    "        '<SOS>': 1,\n",
    "        '<EOS>': 2,\n",
    "        '<NUM>': 3,\n",
    "        '<UNK>': 4,\n",
    "        '<MASK>': 5,\n",
    "        '<Symbol6>': 6,\n",
    "        '<Symbol7>': 7,\n",
    "        '<Symbol8>': 8,\n",
    "        '<Symbol9>': 9,\n",
    "        '<Symbol10>': 10,\n",
    "    }\n",
    "\n",
    "    # Iterate over the sentences directly instead of materialising a full\n",
    "    # row object for every index.\n",
    "    for sent in sents:\n",
    "        # split() without an argument matches the tokenisation used when the\n",
    "        # text is encoded, and never yields empty-string tokens for repeated\n",
    "        # or trailing spaces.\n",
    "        for word in sent.split():\n",
    "            # setdefault only inserts unseen words; len(vocab) is the next id.\n",
    "            vocab.setdefault(word, len(vocab))\n",
    "\n",
    "    return vocab\n",
    "\n",
    "\n",
    "# Build the vocabulary once and keep it in the notebook namespace.\n",
    "vocab = build_vocab()\n",
    "\n",
    "# Quick check: vocabulary size and the id assigned to 'the'.\n",
    "(len(vocab), vocab['the'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1764851",
   "metadata": {},
   "source": [
    "## 使用字典编码文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f6370262",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>same</th>\n",
       "      <th>s1_lens</th>\n",
       "      <th>s2_lens</th>\n",
       "      <th>pad_lens</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>39</td>\n",
       "      <td>1,11,12,13,14,15,16,17,18,19,20,21,22,13,23,2,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "      <td>21</td>\n",
       "      <td>33</td>\n",
       "      <td>1,29,30,31,32,33,34,18,35,25,36,37,3,38,3,3,39...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>32</td>\n",
       "      <td>1,45,46,47,48,49,50,18,51,50,52,3,53,18,54,38,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>1,60,3,61,62,63,64,65,3,66,67,3,3,68,69,3,3,70...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>23</td>\n",
       "      <td>22</td>\n",
       "      <td>27</td>\n",
       "      <td>1,18,77,78,3,3,67,79,3,80,25,81,82,68,3,3,50,1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>45</td>\n",
       "      <td>1,427,1645,2006,25,10152,2246,16,14787,25,18,1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "      <td>50</td>\n",
       "      <td>1,514,448,3,1756,37,18,4646,42,3,1755,2,1756,1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>1,10028,994,2211,37,1627,2190,1672,427,18,1167...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5799</th>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>30</td>\n",
       "      <td>13</td>\n",
       "      <td>1,464,908,18,917,434,69,32,586,58,9275,88,3184...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5800</th>\n",
       "      <td>1</td>\n",
       "      <td>28</td>\n",
       "      <td>23</td>\n",
       "      <td>21</td>\n",
       "      <td>1,2808,2809,2799,205,2800,2801,1573,1658,1243,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5801 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      same  s1_lens  s2_lens  pad_lens  \\\n",
       "0        1       16       17        39   \n",
       "1        0       18       21        33   \n",
       "2        1       20       20        32   \n",
       "3        0       28       19        25   \n",
       "4        1       23       22        27   \n",
       "...    ...      ...      ...       ...   \n",
       "5796     0       16       11        45   \n",
       "5797     0       12       10        50   \n",
       "5798     0       14       17        41   \n",
       "5799     1       29       30        13   \n",
       "5800     1       28       23        21   \n",
       "\n",
       "                                                   sent  \n",
       "0     1,11,12,13,14,15,16,17,18,19,20,21,22,13,23,2,...  \n",
       "1     1,29,30,31,32,33,34,18,35,25,36,37,3,38,3,3,39...  \n",
       "2     1,45,46,47,48,49,50,18,51,50,52,3,53,18,54,38,...  \n",
       "3     1,60,3,61,62,63,64,65,3,66,67,3,3,68,69,3,3,70...  \n",
       "4     1,18,77,78,3,3,67,79,3,80,25,81,82,68,3,3,50,1...  \n",
       "...                                                 ...  \n",
       "5796  1,427,1645,2006,25,10152,2246,16,14787,25,18,1...  \n",
       "5797  1,514,448,3,1756,37,18,4646,42,3,1755,2,1756,1...  \n",
       "5798  1,10028,994,2211,37,1627,2190,1672,427,18,1167...  \n",
       "5799  1,464,908,18,917,434,69,32,586,58,9275,88,3184...  \n",
       "5800  1,2808,2809,2799,205,2800,2801,1573,1658,1243,...  \n",
       "\n",
       "[5801 rows x 5 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Chapter 14/encode text with the vocabulary\n",
    "def f(sent):\n",
    "    \"\"\"Encode one sentence as a comma-separated string of vocabulary ids.\n",
    "\n",
    "    Each whitespace-separated token of sent is looked up in the global\n",
    "    vocab; a token missing from vocab raises KeyError.\n",
    "    \"\"\"\n",
    "    return ','.join(str(vocab[token]) for token in sent.split())\n",
    "\n",
    "\n",
    "# Encode the whole 'sent' column, one sentence string at a time.\n",
    "data['sent'] = data['sent'].map(f)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3534b1e",
   "metadata": {},
   "source": [
    "## 保存处理结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e386fb28",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Chapter 14/save as a csv file\n",
    "# index=False leaves the row index out of the written file.\n",
    "data.to_csv(path_or_buf='data/msr_paraphrase_data.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "760b612d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Chapter 14/save the vocabulary\n",
    "# One row per vocabulary entry: the word and its integer id.\n",
    "pd.DataFrame(\n",
    "    list(vocab.items()),\n",
    "    columns=['word', 'token'],\n",
    ").to_csv('data/msr_paraphrase_vocab.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
